# Step 1: bring the inputs of this step into the workspace. The rest of the
# script expects these files to provide `join_user_activity_df` and
# `non_unique_names`.
print("Loading data...")
load(file = "02_06_m5s_join_user_activity_df.RData")
load(file = "02_06_m5s_join_user_non_unique_names.RData")

# Lower-case the non-unique names once, so later membership checks can compare
# against lower-cased user names.
non_unique_names <- tolower(x = non_unique_names)

# In the visible part of this script stringi is only referenced by the
# commented-out token count, so it is left as an optional require().
# plyr provides revalue(), which the script cannot run without: library()
# stops right here if plyr is missing, instead of failing later with an
# unrelated "could not find function" error.
require(stringi)
library(plyr)

print("Subsetting...")
# Keep only the rows that carry a label_id, and only the three columns that
# describe the user behind an activity.
unique_user_activity_df <- subset(
  join_user_activity_df,
  subset = !is.na(label_id),
  select = c("source", "label_id", "name")
)

# All Facebook sources are replaced with one common "facebook" label.
unique_user_activity_df$source <- revalue(
  unique_user_activity_df$source,
  replace = c(
    "bg_facebook" = "facebook",
    "m5s_facebook" = "facebook",
    "pf_facebook" = "facebook"
  )
)

# Collapse repeated (source, label_id, name) rows into a single row each.
unique_user_activity_df <- unique(unique_user_activity_df)

#print("Counting name length")
#unique_user_activity_df$ltoken <- stri_count(unique_user_activity_df$name,regex="\\S+")>1

# Summary statistics of this step, collected in one named list.
# The per-source counts are the number of rows of unique_user_activity_df
# whose `source` matches the given pattern.
sum_stats <- list(
  "activities" = nrow(join_user_activity_df),
  "unique_users" = nrow(unique_user_activity_df),
  "unique_users_blog" = sum(grepl("blog", unique_user_activity_df$source)),
  "unique_users_forum" = sum(grepl("forum", unique_user_activity_df$source)),
  "unique_users_fb" = sum(grepl("facebook", unique_user_activity_df$source)),
  "unique_users_meetup" = sum(grepl("meetup", unique_user_activity_df$source)),
  # Activities that have no label_id at all.
  "NA_users" = sum(is.na(join_user_activity_df$label_id)),
  # Frequency of each value of the `activity` column.
  "activity_table" = table(join_user_activity_df$activity),
  "non_unique_names" = length(unique(non_unique_names))
)



# Number of repeated names within each source (every occurrence of a name
# after its first one counts as a duplicate).
print("Checking duplicates")
sum_stats[["fb_duplicated"]] <- with(
  unique_user_activity_df,
  sum(duplicated(name[grepl("facebook", source)]))
)
# Blog duplicates have to be 0 by definition of the variable for the blog.
sum_stats[["blog_duplicated"]] <- with(
  unique_user_activity_df,
  sum(duplicated(name[grepl("blog", source)]))
)
sum_stats[["meetup_duplicated"]] <- with(
  unique_user_activity_df,
  sum(duplicated(name[grepl("meetup", source)]))
)
# The forum is counted in two parts: rows whose label_id contains
# "listeciviche/author" (a) and all the other forum rows (b).
sum_stats[["forum_a_duplicated"]] <- with(
  unique_user_activity_df,
  sum(duplicated(
    name[grepl("forum", source) & grepl("listeciviche/author", label_id)]
  ))
)
sum_stats[["forum_b_duplicated"]] <- with(
  unique_user_activity_df,
  sum(duplicated(
    name[grepl("forum", source) & !grepl("listeciviche/author", label_id)]
  ))
)

# Write the summary statistics to disk.
save(list = "sum_stats", file = "02_08_m5s_join_user_sum_stats.RData")

# The full activity table is not referenced again in the visible part of the
# script, so it is dropped from the workspace.
rm(list = "join_user_activity_df")

print("Unifying")
# Give every distinct name one shared id of the form "univ<N>", where N is the
# integer code of that name among the distinct names. Integer (not double)
# codes keep large ids in plain notation: "univ100000", not "univ1e+05".
unique_user_activity_df$universal_id <- paste0(
  "univ",
  as.integer(interaction(unique_user_activity_df$name, drop = TRUE))
)

# Rows whose name cannot identify a single user: names listed in
# non_unique_names (compared in lower case) and missing names, which would
# otherwise all collapse into the single id "univNA".
bool <- is.na(unique_user_activity_df$name) |
  tolower(unique_user_activity_df$name) %in% non_unique_names

# Those rows fall back to their label_id. as.character() guards against
# label_id being a factor, whose integer codes (not its labels) would
# otherwise be written into the character column.
unique_user_activity_df$universal_id[bool] <-
  as.character(unique_user_activity_df$label_id[bool])
unique_user_activity_df$name <- NULL

## Important: universal_id may be non-unique only for the forum, which has two
## author sets. This is intended.

save(unique_user_activity_df, file = "02_08_m5s_unique_user_activity_df.RData")

